This file contains an example of tuning a Logistic Regression model with BayesSearchCV.
import pickle
import time

import helpsk as hlp
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training features/target produced by an earlier data-prep step one directory up.
X_train = _load_pickle('../X_train.pkl')
y_train = _load_pickle('../y_train.pkl')
# Summary statistics for the numeric features (nulls, zeros, distribution shape).
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary of the categorical features (nulls, most frequent value, cardinality).
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first 10 training labels (binary 0/1 target).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Class counts for the target: how many 0s vs 1s (559 vs 241 per the output below).
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class proportions (~70% / ~30%). Compute the counts once instead of calling
# np.unique twice on the same array.
_class_counts = np.unique(y_train, return_counts=True)[1]
_class_counts / _class_counts.sum()
array([0.69875, 0.30125])
# Search space (pipeline + hyper-parameter distributions) for logistic regression,
# provided by helpsk.
search_space = hlp.sklearn_search.LogisticBayesianSearchSpace(random_state=42)

# pip install scikit-optimize
from skopt import BayesSearchCV
from sklearn.model_selection import RepeatedKFold

# Bayesian hyper-parameter search: 5-fold CV repeated twice, optimizing ROC AUC,
# run in parallel across all cores.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(data=X_train),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# perf_counter is monotonic and the recommended clock for elapsed-time measurement
# (time.time can jump if the wall clock is adjusted).
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 96.808 seconds; 1.6 minutes
# Best mean cross-validated ROC AUC found by the search.
print(bayes_search.best_score_)
0.7698249682368827
# Hyper-parameter combination (model, C, imputer, scaler, encoder) that won.
print(bayes_search.best_params_)
OrderedDict([('model', LogisticRegression(C=0.057464185559369926, max_iter=1000, random_state=42)), ('model__C', 0.057464185559369926), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Wrap the searcher output in helpsk's MLExperimentResults for analysis/plotting,
# mapping the verbose sklearn parameter names to friendly ones.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better = True,
    parameter_name_mappings = search_space.param_name_mappings()
)
# Persist to YAML, then reload — verifies the round-trip and lets the analysis
# below be re-run later without re-fitting the search.
results.to_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
# Best score from the reloaded YAML — matches bayes_search.best_score_ above.
results.best_score
0.7698249682368827
# Best parameters under the friendly names (C, imputer, scaler, encoder).
results.best_params
{'model': 'LogisticRegression()',
'C': 0.057464185559369926,
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()',
'encoder': 'OneHotEncoder()'}
# All trials ranked by mean score, with 95% confidence intervals of the mean.
results.to_formatted_dataframe(num_rows=100, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | encoder |
|---|---|---|---|---|---|---|---|
| 1 | 0.770 | 0.751 | 0.789 | 0.057 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 0.770 | 0.750 | 0.789 | 0.069 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 0.770 | 0.750 | 0.789 | 0.069 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 0.770 | 0.750 | 0.789 | 0.069 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 5 | 0.770 | 0.750 | 0.789 | 0.070 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 6 | 0.770 | 0.750 | 0.789 | 0.070 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 7 | 0.770 | 0.750 | 0.789 | 0.070 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 8 | 0.770 | 0.750 | 0.789 | 0.071 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 9 | 0.770 | 0.750 | 0.789 | 0.072 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 10 | 0.770 | 0.750 | 0.789 | 0.070 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 11 | 0.770 | 0.750 | 0.789 | 0.073 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 12 | 0.770 | 0.750 | 0.789 | 0.070 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 13 | 0.770 | 0.750 | 0.789 | 0.080 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 14 | 0.769 | 0.750 | 0.789 | 0.067 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 15 | 0.765 | 0.735 | 0.795 | 0.097 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 16 | 0.765 | 0.744 | 0.785 | 0.261 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 17 | 0.765 | 0.735 | 0.794 | 0.106 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 18 | 0.763 | 0.733 | 0.794 | 0.036 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 19 | 0.763 | 0.736 | 0.790 | 0.279 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 20 | 0.760 | 0.738 | 0.781 | 0.009 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 21 | 0.758 | 0.728 | 0.787 | 0.005 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 22 | 0.757 | 0.738 | 0.777 | <NA> | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 23 | 0.756 | 0.727 | 0.784 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 24 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 25 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 26 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 27 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 28 | 0.755 | 0.737 | 0.773 | 1.596 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 29 | 0.754 | 0.734 | 0.775 | 2.107 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 30 | 0.747 | 0.730 | 0.763 | 11.655 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 31 | 0.745 | 0.728 | 0.762 | 22.913 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 32 | 0.745 | 0.728 | 0.762 | 22.376 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 33 | 0.744 | 0.727 | 0.761 | 32.731 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 34 | 0.742 | 0.725 | 0.759 | 99.507 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 35 | 0.733 | 0.703 | 0.763 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 36 | 0.731 | 0.712 | 0.750 | 0.003 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 37 | 0.729 | 0.711 | 0.748 | 0.088 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 38 | 0.728 | 0.697 | 0.759 | 0.000 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 39 | 0.728 | 0.697 | 0.759 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | OneHotEncoder() |
| 40 | 0.725 | 0.704 | 0.747 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 41 | 0.725 | 0.707 | 0.743 | 3.489 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 42 | 0.725 | 0.707 | 0.743 | 6.797 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 43 | 0.725 | 0.707 | 0.743 | 96.252 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 44 | 0.725 | 0.707 | 0.743 | 99.347 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 45 | 0.725 | 0.706 | 0.744 | 0.403 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 46 | 0.722 | 0.700 | 0.745 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 47 | 0.722 | 0.699 | 0.744 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 48 | 0.712 | 0.689 | 0.734 | 0.023 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
| 49 | 0.705 | 0.682 | 0.728 | 0.001 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 50 | 0.699 | 0.675 | 0.724 | 0.000 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
| 51 | 0.699 | 0.675 | 0.724 | 0.000 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
# Diagnostic plots across all search trials (performance, parameter values,
# scatter matrix, parallel coordinates).
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
results.plot_performance_across_trials(size='C', color='scaler').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs the regularization strength C, colored by scaler and then by encoder.
results.plot_score_vs_parameter(
    parameter='C',
    color='scaler'
)
results.plot_score_vs_parameter(
    parameter='C',
    color='encoder'
)
# Column holding the mean cross-validation score (e.g. 'roc_auc Mean').
# (Removed output residue "roc_auc Mean¶" that was fused onto this line by the
# notebook export.)
score_variable = results.primary_score_name + ' Mean'
# Keep only the score column plus the hyper-parameter columns.
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 19 | 0.769825 | 0.057464 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 21 | 0.769730 | 0.068645 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 34 | 0.769730 | 0.068655 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 33 | 0.769694 | 0.069019 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 25 | 0.769680 | 0.069805 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
def _sanitize(column_name):
    """Make a column name safe for a patsy formula: spaces become underscores,
    and any character that is neither alphanumeric nor an underscore is dropped."""
    underscored = column_name.replace(' ', '_')
    return ''.join(ch for ch in underscored if ch == '_' or ch.isalnum())


# Map each original column name to its formula-safe version.
cleaned_column_names = {col: _sanitize(col) for col in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'C': 'C',
'imputer': 'imputer',
'scaler': 'scaler',
'encoder': 'encoder'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)

import statsmodels.formula.api as smf

# Regress the mean CV score on the hyper-parameters to gauge their impact.
# NOTE(review): `results` is rebound here, shadowing the MLExperimentResults
# object created earlier — the cells below expect the statsmodels fit.
y_column = 'roc_auc_Mean'
predictor_columns = [column for column in score_dataframe.columns.tolist()
                     if column != y_column]
X_columns = hlp.string.collapse(predictor_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.728
Model: OLS Adj. R-squared: 0.698
Method: Least Squares F-statistic: 23.61
Date: Mon, 14 Feb 2022 Prob (F-statistic): 1.88e-11
Time: 18:32:51 Log-Likelihood: 154.15
No. Observations: 50 AIC: -296.3
Df Residuals: 44 BIC: -284.8
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7188 0.005 153.899 0.000 0.709 0.728
imputer[T.SimpleImputer(strategy='median')] -0.0034 0.005 -0.641 0.525 -0.014 0.007
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0033 0.004 -0.843 0.404 -0.011 0.005
scaler[T.StandardScaler()] 0.0063 0.004 1.767 0.084 -0.001 0.013
encoder[T.OneHotEncoder()] 0.0375 0.004 9.553 0.000 0.030 0.045
C -3.878e-05 7.28e-05 -0.533 0.597 -0.000 0.000
==============================================================================
Omnibus: 18.462 Durbin-Watson: 0.841
Prob(Omnibus): 0.000 Jarque-Bera (JB): 22.437
Skew: -1.466 Prob(JB): 1.34e-05
Kurtosis: 4.476 Cond. No. 104.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.compose import ColumnTransformer

# Standardize the numeric columns (score and C) so the regression coefficients
# below are on comparable scales; pass the categorical columns through as-is.
# (Removed redundant re-imports of StandardScaler/pandas — already imported at
# the top of the file — plus an unused `scaler` variable and commented-out code.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
# ColumnTransformer returns a bare array; restore a DataFrame with the columns
# in the transformer's output order (numeric first, then passthrough).
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['imputer', 'scaler', 'encoder']
| roc_auc_Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 0 | 1.04166 | -0.335822 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 1 | 1.037159 | -0.335351 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 1.037159 | -0.33535 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 1.035465 | -0.335335 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 1.034805 | -0.335302 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
# The ColumnTransformer output is an object-dtype array, so the numeric columns
# came back as object dtype — convert them back to floats before refitting.
score_dataframe_transformed = score_dataframe_transformed.astype(
    {'roc_auc_Mean': 'float', 'C': 'float'})
print(formula)
# Refit the same formula on the standardized data; coefficients are now comparable.
model = smf.ols(formula=formula, data=score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.728
Model: OLS Adj. R-squared: 0.698
Method: Least Squares F-statistic: 23.61
Date: Mon, 14 Feb 2022 Prob (F-statistic): 1.88e-11
Time: 18:32:51 Log-Likelihood: -38.745
No. Observations: 50 AIC: 89.49
Df Residuals: 44 BIC: 101.0
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -1.3880 0.217 -6.403 0.000 -1.825 -0.951
imputer[T.SimpleImputer(strategy='median')] -0.1625 0.253 -0.641 0.525 -0.673 0.348
imputer[T.SimpleImputer(strategy='most_frequent')] -0.1546 0.183 -0.843 0.404 -0.524 0.215
scaler[T.StandardScaler()] 0.2966 0.168 1.767 0.084 -0.042 0.635
encoder[T.OneHotEncoder()] 1.7766 0.186 9.553 0.000 1.402 2.151
C -0.0436 0.082 -0.533 0.597 -0.208 0.121
==============================================================================
Omnibus: 18.462 Durbin-Watson: 0.841
Prob(Omnibus): 0.000 Jarque-Bera (JB): 22.437
Skew: -1.466 Prob(JB): 1.34e-05
Kurtosis: 4.476 Cond. No. 5.98
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Tidy table of regression coefficients with p-values.
# Use .values for the Series columns so the frame gets a clean RangeIndex instead
# of duplicating the feature names in both the index and the 'feature' column.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params.values,
    'p_value': results.pvalues.values,
})
# Drop the intercept; .copy() avoids pandas' SettingWithCopyWarning on the
# column assignment below.
coefficients = coefficients.query("feature != 'Intercept'").copy()
# Flag coefficients significant at the 5% level.
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | -0.162479 | 5.247883e-01 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.154624 | 4.039380e-01 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.296624 | 8.408081e-02 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 1.776633 | 2.686761e-12 | True |
| C | C | -0.043564 | 5.967535e-01 | False |
# Original (un-cleaned) score column name — the regression itself used the
# cleaned name 'roc_auc_Mean'.
score_variable
'roc_auc Mean'
# Horizontal bar chart of the coefficients, ordered by absolute magnitude,
# colored by statistical significance (p <= 0.05).
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance

# Permutation importance of each raw feature on the training data, using the
# full best pipeline (so imputation/encoding are applied per permutation).
# (Removed unused `forest = bayes_search.best_estimator_['model']`.)
start_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# Mean importance per feature, sorted descending. NOTE(review): the name
# "forest_importances" is a leftover from a random-forest template — the model
# here is a logistic-regression pipeline.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 4.851 seconds
import matplotlib.pyplot as plt
# Bar chart of permutation importances with std-dev error bars across repeats.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Default rate by foreign_worker status (foreign workers default ~3x as often
# per the output below).
temp = X_train.copy()
temp['default'] = y_train
# Use the string alias 'mean' — passing np.mean to .agg is deprecated in recent
# pandas and resolves to the same built-in mean anyway.
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age split by default status — a visual check on the age/default
# relationship. (Removed a large chunk of commented-out template arguments.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign_worker seems like it should be important (the default rate differs markedly between groups, ~31% vs ~11%) yet it is ranked last in the permutation feature importance.